package OliveXML;

# Copyright (c) 2005, Shawn Boyette
# All rights reserved.
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation files
# (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge,
# publish, distribute, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, provided
# that the above copyright notice(s) and this permission notice appear
# in all copies of the Software and that both the above copyright
# notice(s) and this permission notice appear in supporting
# documentation.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR
# ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY
# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
# OF THIS SOFTWARE.
#
# Except as contained in this notice, the name of a copyright holder
# shall not be used in advertising or otherwise to promote the sale,
# use or other dealings in this Software without prior written
# authorization of the copyright holder.

require Exporter;
use warnings;
use strict;
use Digest::MD5 qw(md5_hex);
use Encode qw(encode_utf8);
use OliveMisc;
use XML::Simple;

our @ISA       = qw(Exporter);
our @EXPORT    = qw( &parsexml );


#-------------------------------------------------------------

sub parsexml {
    my ($cui,$nick) = @_;
    my $c = $cui->userdata->{c};
    my $d = $cui->userdata->{feeds};
    my $l = $cui->userdata->{log};
    my $f = '';

    # actually parse the feed
    eval { $f = XMLin("$d/$nick"); };
    if ($@) {
        $l->log( level => "warning", message => "$nick: XML parse error $@");
        return;
    }

    my %data = ();
    my $md5s = 0;
    
    if ($f->{xmlns} && $f->{xmlns} =~ /atom/) {
        # We don't do ATOM right now.
        errorbox($cui,"Olive does not handle ATOM feeds. This feed will be marked dormant.",$nick);
        $c->{feeds}{$nick}{dormant} = 1;
    } elsif ($f->{version}) {
        # feed is RSS(?) ##################################################
        $data{title} = $f->{channel}{title};
        $data{ttl}   = 3600;
        $data{ttl}   = $f->{channel}{ttl} * 60 if ($f->{channel}{ttl});
        # iterate over the feed items, get back a list of md5sums
        $md5s = parseitems($f->{channel}{item},$cui,$nick);
    } elsif ($f->{channel}{'dc:date'} or $f->{channel}{'rdf:about'}) {
        # feed is RDF 0.9/1.0
        $data{title} = $f->{channel}{title};

        # hash out TTL
        $data{ttl}   = 86400;
        if ($f->{channel}{'syn:updatePeriod'}) {
            my $freq = $f->{channel}{'syn:updatePeriod'};
            $data{ttl} = 3600           if ($freq eq 'hourly');
            $data{ttl} = 86400          if ($freq eq 'daily');
            $data{ttl} = 86400 * 7      if ($freq eq 'weekly');
            $data{ttl} = 86400 * 7 * 30 if ($freq eq 'monthly');
        }
        if ($f->{channel}{'syn:updateFrequency'}) {
            $data{ttl} = int( $data{ttl} / $f->{channel}{'syn:updateFrequency'} );
        }
        # iterate over the feed items, get back a list of md5sums
        $md5s = parseitems($f->{item},$cui,$nick);
    } else {
        $c->{feeds}{$nick}{failures}++;
        if ($c->{feeds}{$nick}{failures}++ == 5) {
            my $disp= $c->{feeds}{$nick}{disp};
            my $msg = "There have been problems parsing ${disp}. " 
                . "Either Olive doesn't know how to handle its format, or XML "
                . "parsing errors have occurred repeatedly.\n\n"
                . "This feed will now be marked as dormant.";
            errorbox($cui,$msg);
            $c->{feeds}{$nick}{dormant} = 1;
        }
    }

    # Undo any potential failure count
    $c->{feeds}{$nick}{failures} = 0;

    # catch illegal TTL values
    $data{ttl} = 3600 if (!defined $data{ttl} or $data{ttl} < 1 or $data{ttl} =~ /\D/);

    return \%data,$md5s;
}

#-------------------------------------------------------------

sub parseitems {
    my ($items,$cui,$nick) = @_;

    return unless $items; # if the feed is empty, do nothing.

    my $c = $cui->userdata->{c};
    my $d = $cui->userdata->{dbh};
    my $l = $cui->userdata->{log};
    my $w = $cui->userdata->{wins};
    my %md5s = ();
    my $i = 1;
    my $j = 0;

    # fix single-entry feeds (should be ARRAY not HASH)
    if (ref($items) eq 'HASH') {
        my $tmpitems = $items;
        undef $items;
        $OliveXML::items->[0] = ();
        my $k = my $v = 0;
        while (($k,$v) = each %{$tmpitems}) {
            $items->[0]{$k} = $v;
        }
    }

    my $count = @{$items};

    foreach my $item (@{$items}) {
        $cui->userdata->{sid} = 0;
        my $links = [];
        my @q = qw(NULL NULL NULL NULL NULL NULL NULL NULL NULL);
        $q[0] = $d->quote($nick);

        # first, build string for md5sum
        # FIXME there is definitely a better way/time to do this...
        my $md5seed = join('',$nick, ($item->{title} || 'NULL'), ($item->{description} || 'NULL'));
        $q[2] = md5_hex(encode_utf8($md5seed)); # which must be wrapped in UTF-8 for chars above 255
        $md5s{$q[2]} = 1;                       # store a copy in %md5s for later
        $q[2] = $d->quote($q[2]);
        # check for matching MD5s (existing entries) and insert if none found
        my ($xid) = $d->selectrow_array("SELECT id FROM stories WHERE nick = $q[0] and md5 = $q[2]");
        unless ($xid) {
            my $statement = "INSERT INTO stories VALUES (NULL, NULL, NULL, "
                . "$q[2], NULL, NULL, 0, NULL, NULL, NULL)";
            my $sth = $d->prepare($statement);
            $sth->execute;
            $cui->userdata->{sid} = $d->last_insert_id(undef,undef,'stories','id');
            $j++;
        }

        if ($item->{pubDate}) {
            # turn the RFC-822 date into seconds-since-epoch
            $q[1] = rfc822($item->{pubDate});
        } elsif ($item->{'dc:date'}) {
            # turn the ISO-8601 date into seconds-since-epoch
            $q[1] = iso8601($item->{'dc:date'});
        } else {
            # no time given. set to epoch.
            $q[1] = 0;
        }

        $q[3] = 0; # 'read' flag
        $q[4] = 1; # 'new' flag

        if ($item->{link}) {
            $item->{link} =~ s/^\s+//;
            $item->{link} =~ s/\s+$//;
            $q[5] = $d->quote($item->{link}) if $item->{link};
        }
        if ($item->{title}) {
            $item->{title} =~ s/^\s+//;
            $item->{title} =~ s/\s+$//;
            ($q[6]) = HTMangLe($item->{title}) if $item->{title};
            $q[6] = $d->quote($q[6]) 
        }

        # mogrify main text
        if((ref($item->{description})) eq "HASH") { 
            $item->{description} = ''; 
        } 
        if ($item->{description}) {
            ($q[7],$links) = HTMangLe($item->{description});
            $q[7] = $d->quote($q[7]);
        }

        # poke the rest of the data into the db if this story's not a dupe 
        if ($cui->userdata->{sid}) {
            my $statement = "UPDATE stories SET nick = $q[0], timestamp = $q[1], "
                . "read = $q[3], new = $q[4], link = $q[5], title = $q[6], desc = $q[7] "
                . 'WHERE id = ' . $cui->userdata->{sid};
            my $sth = $d->prepare($statement);
            $sth->execute;

            my $i = 0;
            foreach my $link (@{$links}) { # ...and the internal links, if any
                $statement = 'INSERT INTO links VALUES (' . $cui->userdata->{sid} .
                    ", $i, " . $d->quote($link->{link}) . ', ' . 
                    $d->quote($link->{desc}) . ')';
                $sth = $d->prepare($statement);
                $sth->execute;
                $i++;
            }
        }
        percent($i,$count,$c->{feeds}{$nick}{disp},$w);
        $i++;
    }

    $l->log( level => "info", message => "$nick: $j new (of $count) stories inserted");
    return \%md5s;
}

#-------------------------------------------------------------

sub percent {
    my ($i,$count,$nick,$w) = @_;

    my $percent = int(($i / $count) * 100);

    my $msg = "Parsing $nick [ $i of $count ($percent\%)]";
    $w->{news}{ftr1}->text($msg . ' ' x ($w->{dim}[0] - length($msg)));
    $w->{news}{ftr1}->draw;
}

#-------------------------------------------------------------

sub HTMangLe {
    my $text = shift;
    Encode::_utf8_on($text);

    # strip story URLs and store them
    my @links = ();
    my $i = 0;
    while ($text =~ m|<a href=["'][^'"]+?["'].*?>.*?</a>|) {
        $text =~ s|<a href=["']([^'"]+?)["'].*?>(.*?)</a>|[$2][$i]|m;
        $links[$i] = { link => $1,
                       desc => $2,
                     };
        $links[$i]->{link} =~ s/^\s+//;
        $links[$i]->{link} =~ s/\s+$//;
        $links[$i]->{link} =~ s/\s{2,}/ /;
        $links[$i]->{desc} =~ s/^\s+//;
        $links[$i]->{desc} =~ s/\s+$//;
        $links[$i]->{desc} =~ s/\s{2,}/ /;
        $i++;
    }

    # reformatting
    $text =~ s#</?(?:strong|b)>#\*#mg; # strong to *strong*
    $text =~ s#</?(?:em|i)>#_#mg;      # em to _em_
    $text =~ s#</?cite>#|#mg;          # cite to |cite|
    $text =~ s/<img.*?src=["']([^'"]+?)["'].*?>/[[img: $1 ]]/mg;
    $text =~ s|</a>||mg;
    $text =~ s/\s{2,}/ /mg;

    # newline mangling
    $text =~ s|<h1>(.+?)</h1>|\n\n>>> $1\n\n|mg;
    $text =~ s|<h\d>(.+?)</h\d>|\n\n-- $1\n\n|mg;
    $text =~ s|</?p>|\n|mg;
    $text =~ s|</?pre>|\n|mg;
    $text =~ s|</?blockquote.*?>|\n|mg;
    $text =~ s|<br\s?/?>|\n|mg;
    $text =~ s/<li>\s*/\n  * /mg;
    $text =~ s/\n{2,}  \* /\n  * /mg;
    $text =~ s/\n{3,}/\n\n/mg;
    $text =~ s/^\n//;

    # strip whatever tags remain
    $text =~ s|</?\w.*?>||mg;

    # entity handling
    # thanks to TorgoX's unicode sliderule
    $text =~ s|&gt;|>|mg;
    $text =~ s|&lt;|<|mg;
    $text =~ s|\x{2018}|'|mg;
    $text =~ s|\x{2019}|'|mg;
    $text =~ s|\x{201c}|"|mg;
    $text =~ s|\x{201d}|"|mg;
    $text =~ s|\x{2026}|...|mg;
    $text =~ s|&#0?36;|\$|mg;
    $text =~ s|&#0?38;|&|mg;
    $text =~ s|&#0?39;|'|mg;
    $text =~ s|&#8211;|--|mg;
    $text =~ s|&#8212;|--|mg;
    $text =~ s|&#8216;|'|mg;
    $text =~ s|&#8217;|'|mg;
    $text =~ s|&#8220;|"|mg;
    $text =~ s|&#8221;|"|mg;
    $text =~ s|&amp;|&|mg;
    $text =~ s|&copy;|(c)|mg;
    $text =~ s|&pound;|L|mg;
    $text =~ s|&quot;|"|mg;
    $text =~ s|&reg;|(r)|mg;
    $text =~ s|&squot;|'|mg;
    $text =~ s|&trade;|[tm]|mg;

    return $text,\@links;
}

1;
